Contexte

On cherche à savoir si notre méthode de seuillage est robuste : on va l'exécuter de nombreuses fois et voir comment se recoupent les résultats.

Nous exécutons cette méthode en inférant un réseau sur les gènes DE entre cnF et CnF (1309 gènes), et en utilisant les valeurs d’expression des gènes dans les conditions cNF, cnF, CNF, CnF (3 réplicats × 4 conditions = 12 colonnes dans la matrice d’expression). On prend une p-value (quantile à 0.001), on ne normalise pas les profils d’expression à une variance unitaire, et on prend 10 variables espionnes tirées dans des lois de Poisson de moyenne la moyenne globale.

source("Funtions/Network_functions.R")

load("./Data/DEGsListsFiltered.RData")
load("./Data/PlnTFBDRegulatorsList.RData")
load("./Data/normalized.count_At.RData")
load("./Data/OntologyAllGenes.RData")


pval = 0.001
# DE genes
genes <- DEGs[["cnF CnF"]]
print(length(genes))
[1] 1309
# expression data
normalized.count <- normalized.count[, grepl("F", colnames(normalized.count))]
head(normalized.count)
               cNF_3      cNF_2      cNF_1      cnF_2      cnF_1      cnF_3
AT1G01010 1404.48413 1137.14606 1370.49722 1234.59407  975.10562 1079.37253
AT1G01020  382.87379  322.15371  354.90475  337.93987  367.05113  392.30632
AT1G01030   28.53146   16.95546   23.33284   28.08096   36.30176   29.68805
AT1G01040 1947.50224 2044.82826 2109.77977 2181.60035 1918.95140 2193.73455
AT1G01050 1826.93381 1765.62838 1746.27871 2355.89598 2214.40739 2297.64271
AT1G01060   85.59438   99.47202  111.75201  130.72173   87.72925  113.45075
               CNF_1      CnF_2      CnF_1      CnF_3      CNF_3      CNF_2
AT1G01010 1464.91355 1068.19624 1184.65014 1186.87210 1342.09996 1267.10176
AT1G01020  387.67033  484.08540  459.57834  488.14901  366.77487  433.48218
AT1G01030   20.58427   44.35090   31.12682   40.89662   24.67096   26.39795
AT1G01040 1953.78985 2363.80881 2189.86332 2117.05265 1444.07324 2059.04036
AT1G01050 2173.35535 2176.02519 2019.58131 2223.20984 1898.01881 2045.14670
AT1G01060  192.11981   82.09635   79.64804   98.32591  159.53884  193.12187
# TFs
regressors = intersect(TF$AGI, genes)

Echantillonnage par shuffling des TFs

Les fonctions utilisées :

insertSpyVariableShuffle <- function(normalized.count, regressors = row.names(normalized.count), 
                              n_spies = 15){
  
  TFs <- sample(regressors, size = n_spies, replace = FALSE)
  
  spies <- data.frame(normalized.count[sample(rownames(normalized.count), size=n_spies),], 
                      row.names = paste0("spy_",seq(1:n_spies)))
  
  for(spy in seq(1:n_spies)){
    spies[spy,] <- sample(normalized.count[sample(regressors, size = 1),], size = dim(normalized.count)[2],
                          replace = FALSE)
  }
  
  
  normalized.count <- rbind.data.frame(normalized.count, spies)
  #print(tail(normalized.count))
  return(normalized.count)
}

getPvalue <- function(value, null_distribution){
  return(sum(null_distribution > value)/length(null_distribution))
}

getCorrectedPvalues <- function(null_distribution, importances_to_test, method = 'fdr'){
  pvalues <- sapply(importances_to_test, getPvalue, null_distribution = null_distribution)
  fdr <- p.adjust(pvalues, method = method)
  return(fdr)
}


genie3 <- function(normalized.count, regressors=NA, targets=NA, nTrees=1000, nCores=5, 
                fdr = 0.1, n_spies = 15, batches = FALSE){
  

#   ____________________________________________________________________________
#   batch geneie3                                                           ####
# with new spy variables for each run
  
  
  if(batches){
    mat <- matrix(nrow = length(regressors)+n_spies, ncol = length(targets))
    rownames(mat) <- c(regressors, paste0("spy_",seq(1:n_spies)))
    colnames(mat) <- targets
    if(length(targets) %% 2 == 0){
      for(i in seq(1,length(targets), by = 2)){
        
        normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
                                       n_spies = n_spies)
        
        regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
        
        mat[,targets[seq(i,i+1)]] <- GENIE3(as.matrix(normalized.count_), regulators = regressors_, 
                                       targets = targets[seq(i,i+1)], treeMethod = "RF", K = "sqrt", 
                                       nTrees = nTrees, nCores = nCores, verbose = F)
      }
    }
    if(length(targets) %% 2 == 1){
      for(i in seq(1,length(targets)-1, by = 2)){
        if(i+2 != length(targets)){
          normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
                                       n_spies = n_spies)
        
          regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
          
          b<-mat[,targets[c(i,i+1)]] <- a <- GENIE3(as.matrix(normalized.count_), regulators = regressors_, 
                                         targets = targets[seq(i,i+1)], treeMethod = "RF", K = "sqrt", 
                                         nTrees = nTrees, nCores = nCores, verbose = F)
        }
        if(i+2 == length(targets)){
          normalized.count_ <- insertSpyVariableShuffle(normalized.count, regressors,
                                       n_spies = n_spies)
        
          regressors_ = c(regressors, rownames(normalized.count_)[grepl('spy_', rownames(normalized.count_))])
          mat[,targets[seq(i,i+2)]] <- GENIE3(as.matrix(normalized.count_), regulators = regressors_, 
                                         targets = targets[seq(i,i+2)], treeMethod = "RF", K = "sqrt", 
                                         nTrees = nTrees, nCores = nCores, verbose = F)
        }
          
      }
    }
    
  }
  
  
#   ____________________________________________________________________________
#   classic genie3                                                          ####

  else{
    
    normalized.count <- insertSpyVariableShuffle(normalized.count, regressors,
                                       n_spies = n_spies)
  
    regressors <- intersect(rownames(normalized.count),regressors)
    regressors = c(regressors, rownames(normalized.count)[grepl('spy_', rownames(normalized.count))])
    
    mat <- GENIE3(as.matrix(normalized.count), regulators = regressors, targets = targets, 
               treeMethod = "RF", K = "sqrt", nTrees = nTrees, nCores = nCores, verbose = T)
  }
  

  
  spies <- c(mat[grepl('spy_', rownames(mat)),])
  tfs <- c(mat[!grepl('spy_', rownames(mat)),])
  
  # pvals <- sapply(tfs, getPvalue, null_distribution=spies)
  # fdr_ <- getCorrectedPvalues(spies, tfs, method = "fdr")
  # 
  # 
  
  d <- data.frame("Importance" = c(tfs, spies), Variable = c(rep("TF", length(tfs)), rep("Spy", length(spies))))
  p <- ggplot(data = d, aes(x = Importance, fill = Variable)) +
    geom_density(alpha = 0.3) + geom_vline(xintercept = quantile(spies, probs = 1 - pval),
                                           linetype="dotted", size=1.5)

  print(p + ggtitle("Distribution des importances pour les variables espionnes et les vrais TFs"))
  print(p + ylim(c(0,5)) + ggtitle("Zoom sur la partie avec le seuil (quantile à  1/1000)"))
  # 
  # d <- data.frame("p_values" = c(pvals, fdr_), Correction = c(rep("none", length(pvals)), rep("fdr", length(fdr_))))
  # p <- ggplot(data = d, aes(x = p_values, fill = Correction)) +
  #   geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr,
  #                                          linetype="dotted", size=1.5)
  # print(p + ggtitle("Distribution des p values, avec correction BH (fdr) ou non"))

  
  links <- getLinkList(mat)
  links <- links[!grepl('spy_', links[,1]),]
  links$fdr <- getCorrectedPvalues(null_distribution = spies, 
                                   importances_to_test = links$weight,
                                   method = "fdr")
  

  d <- data.frame("p_values" = links$fdr)
  
  p <- ggplot(data = d, aes(x = p_values)) +
    geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr, linetype="dotted", size=1.5)
  print(p + ggtitle("Distribution des p values, avec correction BH (fdr)"))
  
 
  print(fdr)
  print(sum(links$fdr < fdr))
  
  
  head(sort(links$fdr))
  links <- links[links$fdr < fdr,]


  print(paste0("Number of links : ", dim(links)[1]))
  g <- graph_from_data_frame(links, directed = T)
  return(g)
}


genie3_no_spy <- function(normalized.count, regressors=NA, targets=NA, nTrees=1000, nCores=5, 
                fdr = 0.1, n_spies = 15, batches = FALSE){
  # normalized.count <- insertSpyVariableShuffle(normalized.count, regressors,
  #                                    n_spies = n_spies)
  # 
  # regressors <- intersect(rownames(normalized.count),regressors)
  regressors = c(regressors, rownames(normalized.count)[grepl('spy_', rownames(normalized.count))])
  
  mat <- GENIE3(as.matrix(normalized.count), regulators = regressors, targets = targets, 
             treeMethod = "RF", K = "sqrt", nTrees = nTrees, nCores = nCores, verbose = T)

  

  spies <- c(mat[grepl('spy_', rownames(mat)),])
  tfs <- c(mat[!grepl('spy_', rownames(mat)),])
  
  # pvals <- sapply(tfs, getPvalue, null_distribution=spies)
  # fdr_ <- getCorrectedPvalues(spies, tfs, method = "fdr")
  # 
  # 
  
  d <- data.frame("Importance" = c(tfs, spies), Variable = c(rep("TF", length(tfs)), rep("Spy", length(spies))))
  p <- ggplot(data = d, aes(x = Importance, fill = Variable)) +
    geom_density(alpha = 0.3) + geom_vline(xintercept = quantile(spies, probs = 1 - pval),
                                           linetype="dotted", size=1.5)

  print(p + ggtitle("Distribution des importances pour les variables espionnes et les vrais TFs"))
  print(p + ylim(c(0,5)) + ggtitle("Zoom sur la partie avec le seuil (quantile à  1/1000)"))
  # 
  # d <- data.frame("p_values" = c(pvals, fdr_), Correction = c(rep("none", length(pvals)), rep("fdr", length(fdr_))))
  # p <- ggplot(data = d, aes(x = p_values, fill = Correction)) +
  #   geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr,
  #                                          linetype="dotted", size=1.5)
  # print(p + ggtitle("Distribution des p values, avec correction BH (fdr) ou non"))

  
  links <- getLinkList(mat)
  links <- links[!grepl('spy_', links[,1]),]
  links$fdr <- getCorrectedPvalues(null_distribution = spies, 
                                   importances_to_test = links$weight,
                                   method = "fdr")
  

  d <- data.frame("p_values" = links$fdr)
  
  p <- ggplot(data = d, aes(x = p_values)) +
    geom_density(alpha = 0.3)+ geom_vline(xintercept = fdr, linetype="dotted", size=1.5)
  print(p + ggtitle("Distribution des p values, avec correction BH (fdr)"))
  
 
  print(fdr)
  print(sum(links$fdr < fdr))
  
  
  head(sort(links$fdr))
  links <- links[links$fdr < fdr,]


  print(paste0("Number of links : ", dim(links)[1]))
  g <- graph_from_data_frame(links, directed = T)
  return(g)
}

Iterations de la méthode

res <- list()

normalized.count <- insertSpyVariableShuffle(normalized.count, regressors, n_spies = 20)

for (i in seq(1:30)) {
    res[[i]] <- genie3_no_spy(normalized.count, regressors = regressors, targets = genes, 
        fdr = 0.15, n_spies = 20)
}

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 7
[1] "Number of links : 7"

[1] 0.15
[1] 11
[1] "Number of links : 11"

[1] 0.15
[1] 4
[1] "Number of links : 4"

[1] 0.15
[1] 182
[1] "Number of links : 182"

[1] 0.15
[1] 4
[1] "Number of links : 4"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 94
[1] "Number of links : 94"

[1] 0.15
[1] 2
[1] "Number of links : 2"

[1] 0.15
[1] 1
[1] "Number of links : 1"

[1] 0.15
[1] 184
[1] "Number of links : 184"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 2
[1] "Number of links : 2"

[1] 0.15
[1] 11
[1] "Number of links : 11"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 0
[1] "Number of links : 0"

[1] 0.15
[1] 2
[1] "Number of links : 2"

[1] 0.15
[1] 2
[1] "Number of links : 2"

[1] 0.15
[1] 1
[1] "Number of links : 1"

[1] 0.15
[1] 6
[1] "Number of links : 6"

[1] 0.15
[1] 72
[1] "Number of links : 72"

[1] 0.15
[1] 23
[1] "Number of links : 23"

[1] 0.15
[1] 23
[1] "Number of links : 23"

[1] 0.15
[1] 2
[1] "Number of links : 2"

[1] 0.15
[1] 0
[1] "Number of links : 0"
save(res, file = "D:/These/NetworkInference/networks_from_fdrs_unique_spies.RData")

Resultats

load("D:/These/NetworkInference/networks_from_fdrs_unique_spies.RData")

mat <- matrix(nrow = length(res), ncol = length(res))

# length(E(intersection(res[[6]], res[[3]])))


for (i in seq(1:length(res))) {
    for (j in seq(1:length(res))) {
        mat[i, j] = length(igraph::E(igraph::intersection(res[[i]], res[[j]])))
        
    }
}

library(reshape2)
library(ggplot2)

data <- melt(mat)
library(viridis)
ggplot(data, aes(x = Var1, y = Var2, fill = value)) + geom_tile() + scale_fill_viridis() + 
    ggtitle("Edges Intersection Length between graphs from different runs")

lenghts <- unlist(lapply(res, function(net) {
    return(length(igraph::E(net)))
}))
hist(lenghts, breaks = 30)

sd(lenghts)
[1] 48.8074

Ce n’est vraiment pas robuste… Du tout. On va implémenter notre méthode consistant à tirer de nouvelles espionnes pour chaque lot de deux gènes, et aussi essayer de regrouper les TFs très corrélés en un seul gène.

Même avec notre méthode qui génère de nouvelles espionnes pour chaque lot de 2 gènes, on n’a pas vraiment atteint la robustesse.

On pourrait : essayer en prenant plus d’arbres, ou en considérant plus que la racine carrée du nombre de variables à chaque split.

Je vais aussi essayer de lancer avec les mêmes espionnes sur 30 runs, pour voir quelle est la part de la génération des espionnes dans la variabilité du résultat. Si ça se trouve, c’est pas la faute des espionnes.